Step 1. Read the data

# Load the data; read_csv expects ADAproject_2025_data.csv in the working
# directory and reports the parsed column spec (2 chr, 20 dbl — see output)
df <- read_csv("ADAproject_2025_data.csv")
## Rows: 470 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Variant, loanofficer_id
## dbl (20): day, typeI_init, typeI_fin, typeII_init, typeII_fin, agree_init, a...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
view(df)

Step 2. EDA

# Show the distribution of each variable, split by Variant.
# Columns 1-2 (Variant, loanofficer_id) are identifiers, not metrics.
numeric_columns_1 <- names(df)[-(1:2)]

plot_list_1 <- lapply(numeric_columns_1, function(metric) {
  ggplot(df, aes(x = .data[[metric]], fill = Variant)) +
    geom_density(alpha = 0.5) +
    labs(
      title = paste("Distribution of", metric, "by Variant"),
      x = metric,
      y = "Density"
    ) +
    theme_minimal()
})
names(plot_list_1) <- numeric_columns_1
print(plot_list_1)
## $day

## 
## $typeI_init

## 
## $typeI_fin

## 
## $typeII_init

## 
## $typeII_fin

## 
## $agree_init

## 
## $agree_fin

## 
## $conflict_init

## 
## $conflict_fin

## 
## $revised_per_ai

## 
## $revised_agst_ai

## 
## $fully_complt

## 
## $confidence_init_total

## 
## $confidence_fin_total

## 
## $complt_init

## 
## $complt_fin

## 
## $ai_typeI

## 
## $ai_typeII

## 
## $badloans_num

## 
## $goodloans_num

## # A tibble: 138 × 22
##    Variant   loanofficer_id   day typeI_init typeI_fin typeII_init typeII_fin
##    <chr>     <chr>          <dbl>      <dbl>     <dbl>       <dbl>      <dbl>
##  1 Treatment qamcqdoe           1          0         0           2          2
##  2 Treatment 09pij0e2           1          3         2           0          1
##  3 Treatment 4cdwcblq           1          1         1           0          1
##  4 Treatment 7bx6hbg5           1          4         2           1          2
##  5 Treatment kmr3oifc           1          2         2           0          1
##  6 Treatment uybljp0c           1          1         1           0          0
##  7 Control   2udootyt           1          2         0           1          0
##  8 Control   2udootyt           2          4         0           1          0
##  9 Control   2udootyt           3          3         0           2          0
## 10 Control   2udootyt           4          3         0           2          0
## # ℹ 128 more rows
## # ℹ 15 more variables: agree_init <dbl>, agree_fin <dbl>, conflict_init <dbl>,
## #   conflict_fin <dbl>, revised_per_ai <dbl>, revised_agst_ai <dbl>,
## #   fully_complt <dbl>, confidence_init_total <dbl>,
## #   confidence_fin_total <dbl>, complt_init <dbl>, complt_fin <dbl>,
## #   ai_typeI <dbl>, ai_typeII <dbl>, badloans_num <dbl>, goodloans_num <dbl>
# Rows where the initial completion count exceeds the final one
df_filtered_higher <- df %>% filter(complt_init > complt_fin)
print(df_filtered_higher)
## # A tibble: 96 × 22
##    Variant loanofficer_id   day typeI_init typeI_fin typeII_init typeII_fin
##    <chr>   <chr>          <dbl>      <dbl>     <dbl>       <dbl>      <dbl>
##  1 Control 2udootyt           1          2         0           1          0
##  2 Control 2udootyt           2          4         0           1          0
##  3 Control 2udootyt           3          3         0           2          0
##  4 Control 2udootyt           4          3         0           2          0
##  5 Control 2udootyt           5          3         0           0          0
##  6 Control 2udootyt           6          5         0           0          0
##  7 Control 2udootyt           7          5         0           0          0
##  8 Control 2udootyt           8          4         0           1          0
##  9 Control 2udootyt           9          4         0           0          0
## 10 Control 2udootyt          10          2         0           0          0
## # ℹ 86 more rows
## # ℹ 15 more variables: agree_init <dbl>, agree_fin <dbl>, conflict_init <dbl>,
## #   conflict_fin <dbl>, revised_per_ai <dbl>, revised_agst_ai <dbl>,
## #   fully_complt <dbl>, confidence_init_total <dbl>,
## #   confidence_fin_total <dbl>, complt_init <dbl>, complt_fin <dbl>,
## #   ai_typeI <dbl>, ai_typeII <dbl>, badloans_num <dbl>, goodloans_num <dbl>
# Rows with all 10 initial reviews completed but zero final reviews recorded
df_filtered_10 <- df %>% filter(complt_init == 10, complt_fin == 0)
print(df_filtered_10)
## # A tibble: 82 × 22
##    Variant loanofficer_id   day typeI_init typeI_fin typeII_init typeII_fin
##    <chr>   <chr>          <dbl>      <dbl>     <dbl>       <dbl>      <dbl>
##  1 Control 2udootyt           2          4         0           1          0
##  2 Control 2udootyt           3          3         0           2          0
##  3 Control 2udootyt           4          3         0           2          0
##  4 Control 2udootyt           5          3         0           0          0
##  5 Control 2udootyt           6          5         0           0          0
##  6 Control 2udootyt           7          5         0           0          0
##  7 Control 2udootyt           8          4         0           1          0
##  8 Control 2udootyt           9          4         0           0          0
##  9 Control 2udootyt          10          2         0           0          0
## 10 Control l31kzq2d           3          5         0           2          0
## # ℹ 72 more rows
## # ℹ 15 more variables: agree_init <dbl>, agree_fin <dbl>, conflict_init <dbl>,
## #   conflict_fin <dbl>, revised_per_ai <dbl>, revised_agst_ai <dbl>,
## #   fully_complt <dbl>, confidence_init_total <dbl>,
## #   confidence_fin_total <dbl>, complt_init <dbl>, complt_fin <dbl>,
## #   ai_typeI <dbl>, ai_typeII <dbl>, badloans_num <dbl>, goodloans_num <dbl>
# Rows where the final completion count exceeds the initial one
df_filtered_lower <- df %>% filter(complt_init < complt_fin)
print(df_filtered_lower)
## # A tibble: 42 × 22
##    Variant   loanofficer_id   day typeI_init typeI_fin typeII_init typeII_fin
##    <chr>     <chr>          <dbl>      <dbl>     <dbl>       <dbl>      <dbl>
##  1 Treatment qamcqdoe           1          0         0           2          2
##  2 Treatment 09pij0e2           1          3         2           0          1
##  3 Treatment 4cdwcblq           1          1         1           0          1
##  4 Treatment 7bx6hbg5           1          4         2           1          2
##  5 Treatment kmr3oifc           1          2         2           0          1
##  6 Treatment uybljp0c           1          1         1           0          0
##  7 Treatment 92vdohom           1          0         0           0          0
##  8 Treatment vflkw3iq           1          3         2           1          1
##  9 Treatment yc74rzbp           1          3         2           1          3
## 10 Treatment 1ckkyukp           1          1         3           0          2
## # ℹ 32 more rows
## # ℹ 15 more variables: agree_init <dbl>, agree_fin <dbl>, conflict_init <dbl>,
## #   conflict_fin <dbl>, revised_per_ai <dbl>, revised_agst_ai <dbl>,
## #   fully_complt <dbl>, confidence_init_total <dbl>,
## #   confidence_fin_total <dbl>, complt_init <dbl>, complt_fin <dbl>,
## #   ai_typeI <dbl>, ai_typeII <dbl>, badloans_num <dbl>, goodloans_num <dbl>

Step 3. Data preparation

Try treatment_1: remove the rows where fully_complt != 10, and aggregate the data grouped by loan officer id

Data cleaning

# Keep only fully completed officer-days (fully_complt == 10); incomplete
# days are dropped rather than imputed
df_1 <- df %>% filter(fully_complt == 10)

Data summary

# Summarise the data per loan officer: the mean of every daily metric.
# across() replaces the 19 hand-written mean() calls; metric_cols fixes the
# column order and ".names" keeps the exact *_mean names of the original,
# so df_summary's columns are unchanged.
metric_cols <- c(
  "typeI_init", "typeI_fin", "typeII_init", "typeII_fin",
  "agree_init", "agree_fin", "conflict_init", "conflict_fin",
  "confidence_init_total", "confidence_fin_total",
  "revised_per_ai", "revised_agst_ai",
  "fully_complt", "complt_init", "complt_fin",
  "ai_typeI", "ai_typeII", "goodloans_num", "badloans_num"
)
df_summary <- df_1 %>%
  group_by(Variant, loanofficer_id) %>%
  summarise(
    across(all_of(metric_cols), mean, .names = "{.col}_mean"),
    .groups = "drop"
  )
print(df_summary)
## # A tibble: 38 × 21
##    Variant loanofficer_id typeI_init_mean typeI_fin_mean typeII_init_mean
##    <chr>   <chr>                    <dbl>          <dbl>            <dbl>
##  1 Control 0g7pi6g8                  3.44           3.89            1.44 
##  2 Control 0gh7r2hr                  2.33           2.44            1.56 
##  3 Control bzeya726                  2.44           2.33            1.44 
##  4 Control dlpxpwdj                  6              6               0.571
##  5 Control i6miisiq                  5.62           5.5             0.875
##  6 Control p5g1bxa1                  2.89           3.11            1.33 
##  7 Control qwun9ha5                  3.62           3.75            1.25 
##  8 Control sarganjx                  2.22           2.33            1.56 
##  9 Control ugdh6i8o                  4.56           5.22            0.778
## 10 Control uui3fiii                  2.78           2.89            1.22 
## # ℹ 28 more rows
## # ℹ 16 more variables: typeII_fin_mean <dbl>, agree_init_mean <dbl>,
## #   agree_fin_mean <dbl>, conflict_init_mean <dbl>, conflict_fin_mean <dbl>,
## #   confidence_init_total_mean <dbl>, confidence_fin_total_mean <dbl>,
## #   revised_per_ai_mean <dbl>, revised_agst_ai_mean <dbl>,
## #   fully_complt_mean <dbl>, complt_init_mean <dbl>, complt_fin_mean <dbl>,
## #   ai_typeI_mean <dbl>, ai_typeII_mean <dbl>, goodloans_num_mean <dbl>, …

Since aggregating significantly reduces the sample size to 38, we choose to proceed without aggregation to retain more data and ensure better statistical power for analysis.

Try treatment_2: do not aggregate the data, but remove the rows where fully_complt != 10

Data cleaning

# Keep only fully completed officer-days (fully_complt == 10) — same filter
# as df_1 above, but the data stays at the daily level (no aggregation)
df_2 <- df %>% filter(fully_complt == 10)
print(df_2)
## # A tibble: 330 × 22
##    Variant   loanofficer_id   day typeI_init typeI_fin typeII_init typeII_fin
##    <chr>     <chr>          <dbl>      <dbl>     <dbl>       <dbl>      <dbl>
##  1 Treatment qamcqdoe           2          2         2           3          3
##  2 Treatment qamcqdoe           3          3         3           0          0
##  3 Treatment qamcqdoe           4          1         2           1          1
##  4 Treatment qamcqdoe           5          0         2           0          0
##  5 Treatment qamcqdoe           6          0         1           4          0
##  6 Treatment qamcqdoe           7          0         1           1          0
##  7 Treatment qamcqdoe           8          0         3           4          1
##  8 Treatment qamcqdoe           9          0         1           4          1
##  9 Treatment qamcqdoe          10          0         0           2          1
## 10 Treatment 09pij0e2           2          1         1           3          3
## # ℹ 320 more rows
## # ℹ 15 more variables: agree_init <dbl>, agree_fin <dbl>, conflict_init <dbl>,
## #   conflict_fin <dbl>, revised_per_ai <dbl>, revised_agst_ai <dbl>,
## #   fully_complt <dbl>, confidence_init_total <dbl>,
## #   confidence_fin_total <dbl>, complt_init <dbl>, complt_fin <dbl>,
## #   ai_typeI <dbl>, ai_typeII <dbl>, badloans_num <dbl>, goodloans_num <dbl>

Feature engineering

# Feature engineering: decision-quality metrics before (init) and after (fin)
# the AI recommendation, built in one mutate() pipeline. Later columns may
# reference earlier ones because mutate() evaluates expressions in order.
# NOTE(review): 0/0 divisions yield NaN (e.g. when badloans_num + typeII_* is
# zero); these propagate downstream as non-finite values, same as before.
df_2 <- df_2 %>%
  mutate(
    # Recall-style ratio — assumes typeII_* counts missed bad loans; confirm
    recall_init = badloans_num / (badloans_num + typeII_init),
    recall_fin = badloans_num / (badloans_num + typeII_fin),
    # Precision-style ratio — assumes typeI_* counts false alarms; confirm
    precision_init = badloans_num / (badloans_num + typeI_init),
    precision_fin = badloans_num / (badloans_num + typeI_fin),
    # Improvement of recall and precision after seeing the model
    recall_imp = recall_fin - recall_init,
    precision_imp = precision_fin - precision_init,
    # Conflict rate per completed review
    conflict_init_rate = conflict_init / complt_init,
    conflict_fin_rate = conflict_fin / complt_fin,
    # Decline of the conflict rate
    conflict_dec = conflict_init_rate - conflict_fin_rate,
    # Improvement of total confidence
    confidence_imp = confidence_fin_total - confidence_init_total
  )

Data transformation

# Improvement of confidence: because the confidence totals reach 1000, scale
# confidence_imp by 1000 before log-transforming, i.e. log(x/1000 + 1).
# NOTE(review): if confidence_imp ever equals -1000 this gives log(0) = -Inf
df_2$log_confidence_imp <- log(df_2$confidence_imp / 1000 + 1)

Data standardization using min-max

# Min-max standardize the log-transformed confidence improvement to [0, 1].
# range() computes min and max in a single pass, and na.rm = TRUE keeps one
# NA from turning the entire scaled column into NA (the original min()/max()
# calls had no na.rm and would do exactly that). With no NAs present the
# result is identical to the original.
rng <- range(df_2$log_confidence_imp, na.rm = TRUE)
df_2$scle_log_confidence_imp <- (df_2$log_confidence_imp - rng[1]) / (rng[2] - rng[1])

Recheck data

# Re-check the distributions after cleaning and feature engineering;
# columns 1-2 (Variant, loanofficer_id) are identifiers, not metrics.
numeric_columns_3 <- names(df_2)[-(1:2)]

plot_list_3 <- lapply(numeric_columns_3, function(metric) {
  ggplot(df_2, aes(x = .data[[metric]], fill = Variant)) +
    geom_density(alpha = 0.5) +
    labs(
      title = paste("Distribution of", metric, "by Variant"),
      x = metric,
      y = "Density"
    ) +
    theme_minimal()
})
names(plot_list_3) <- numeric_columns_3
print(plot_list_3)
## $day

## 
## $typeI_init

## 
## $typeI_fin

## 
## $typeII_init

## 
## $typeII_fin

## 
## $agree_init

## 
## $agree_fin

## 
## $conflict_init

## 
## $conflict_fin

## 
## $revised_per_ai

## 
## $revised_agst_ai

## 
## $fully_complt

## 
## $confidence_init_total

## 
## $confidence_fin_total

## 
## $complt_init

## 
## $complt_fin

## 
## $ai_typeI

## 
## $ai_typeII

## 
## $badloans_num

## 
## $goodloans_num

## 
## $recall_init
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_density()`).

## 
## $recall_fin
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_density()`).

## 
## $precision_init
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

## 
## $precision_fin

## 
## $recall_imp
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_density()`).

## 
## $precision_imp
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

## 
## $conflict_init_rate

## 
## $conflict_fin_rate

## 
## $conflict_dec

## 
## $confidence_imp

## 
## $log_confidence_imp

## 
## $scle_log_confidence_imp

Step4. T-test

T-test for treatment_2

# Welch two-sample t-tests (unequal variances), Control vs Treatment, for
# each engineered outcome. The helper factors out the shared arguments.
welch_by_variant <- function(fml) {
  t.test(fml, data = df_2, var.equal = FALSE)
}

t_tests_welch_2 <- list(
  recall_imp = welch_by_variant(recall_imp ~ Variant),
  precision_imp = welch_by_variant(precision_imp ~ Variant),
  conflict_rate_dec = welch_by_variant(conflict_dec ~ Variant),
  confidence_imp = welch_by_variant(scle_log_confidence_imp ~ Variant)
)

print(t_tests_welch_2)
## $recall_imp
## 
##  Welch Two Sample t-test
## 
## data:  recall_imp by Variant
## t = -4.0118, df = 265.83, p-value = 7.837e-05
## alternative hypothesis: true difference in means between group Control and group Treatment is not equal to 0
## 95 percent confidence interval:
##  -0.08089470 -0.02763192
## sample estimates:
##   mean in group Control mean in group Treatment 
##             0.001508859             0.055772171 
## 
## 
## $precision_imp
## 
##  Welch Two Sample t-test
## 
## data:  precision_imp by Variant
## t = -3.1907, df = 304.66, p-value = 0.001567
## alternative hypothesis: true difference in means between group Control and group Treatment is not equal to 0
## 95 percent confidence interval:
##  -0.06778803 -0.01607042
## sample estimates:
##   mean in group Control mean in group Treatment 
##             -0.01683739              0.02509183 
## 
## 
## $conflict_rate_dec
## 
##  Welch Two Sample t-test
## 
## data:  conflict_dec by Variant
## t = -5.1299, df = 246.26, p-value = 5.878e-07
## alternative hypothesis: true difference in means between group Control and group Treatment is not equal to 0
## 95 percent confidence interval:
##  -0.10590730 -0.04714265
## sample estimates:
##   mean in group Control mean in group Treatment 
##              0.03372093              0.11024590 
## 
## 
## $confidence_imp
## 
##  Welch Two Sample t-test
## 
## data:  scle_log_confidence_imp by Variant
## t = -2.0186, df = 281.22, p-value = 0.04448
## alternative hypothesis: true difference in means between group Control and group Treatment is not equal to 0
## 95 percent confidence interval:
##  -0.0508259430 -0.0006396344
## sample estimates:
##   mean in group Control mean in group Treatment 
##               0.4164260               0.4421588

Welch t-tests show that Treatment significantly improved recall (p < 0.001) and precision (p < 0.01), indicating better AI-assisted decision-making. Conflict significantly decreased (p < 0.001), meaning loan officers aligned more with AI recommendations. Confidence improved slightly (p < 0.05), but the effect is small.

Build the oec

# Run PCA to determine variable importance in OEC.
# NOTE(review): suppressWarnings() here only hides library() attach notices;
# PCA() itself still warns that missing values are mean-imputed (see output) —
# missMDA::imputePCA would be the proper imputation, as the warning suggests.
suppressWarnings(library(FactoMineR))
pca_model <- PCA(df_2[, c("recall_imp", 
                                   "precision_imp", 
                                   "conflict_dec", 
                                   "scle_log_confidence_imp")], 
                 scale.unit = TRUE, graph = FALSE)
## Warning in PCA(df_2[, c("recall_imp", "precision_imp", "conflict_dec",
## "scle_log_confidence_imp")], : Missing values are imputed by the mean of the
## variable: you should use the imputePCA function of the missMDA package
print(pca_model$eig)
##        eigenvalue percentage of variance cumulative percentage of variance
## comp 1  1.3909256               34.77314                          34.77314
## comp 2  1.1556634               28.89159                          63.66473
## comp 3  0.9397291               23.49323                          87.15795
## comp 4  0.5136819               12.84205                         100.00000

According to the PCA result: PC1 explains 34.77% of the variance, PC2 a further 28.89% (63.66% cumulative), PC3 23.49% (87.16% cumulative), and PC4 the remaining 12.84%. Note that each principal component is a linear combination of all four variables rather than a single variable, so labelling PC1 as recall_imp (and so on) is only a rough shorthand; the variable loadings (pca_model$var$coord) should be checked before using these variance shares as weights for the individual metrics.

# OEC weights for recall_imp, precision_imp, conflict_dec, scle_log_confidence_imp.
# NOTE(review): these numbers are the variance shares of PC1..PC4, not the
# loadings of the individual variables — each PC mixes all four variables, so
# "PC1 = recall_imp" is only shorthand; confirm against pca_model$var$coord
# before relying on this weighting.
w1 <- 0.3477 # weight of recall_imp
w2 <- 0.289 # weight of precision_imp
w3 <- 0.2349 # weight of conflict_dec
w4 <- 0.1284 # weight of scle_log_confidence_imp

Compute the oec

# Overall Evaluation Criterion: weighted sum of the four engineered metrics,
# weights taken from the PCA variance shares above
df_2$oec <- with(
  df_2,
  w1 * recall_imp + w2 * precision_imp + w3 * conflict_dec +
    w4 * scle_log_confidence_imp
)
# Welch t-test on the OEC, Control vs Treatment
t.test(oec ~ Variant, data = df_2, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  oec by Variant
## t = -6.7623, df = 293.79, p-value = 7.327e-11
## alternative hypothesis: true difference in means between group Control and group Treatment is not equal to 0
## 95 percent confidence interval:
##  -0.06921005 -0.03800635
## sample estimates:
##   mean in group Control mean in group Treatment 
##              0.05704877              0.11065697

Welch t-test shows that Treatment significantly improved the Overall Evaluation Criterion (OEC) (p < 0.001), with the Treatment group (mean = 0.1107) performing notably better than the Control group (mean = 0.0570). The 95% confidence interval [-0.0692, -0.0380] confirms a meaningful positive impact of AI intervention.

Compute Difference in OEC and each variables (Actual Value & %) between Variants

# Per-Variant means of the OEC and each component metric, then the
# Treatment-minus-Control difference in absolute terms and as a percentage
# of the Control mean.
# NOTE: the second summarise() reuses columns it created earlier in the same
# call (e.g. Diff_Treatment_Control inside Perc_Treatment_Control) — dplyr
# evaluates these sequentially, so the expression order must be preserved.
pairwise_diff <- df_2 %>%
  group_by(Variant) %>%
  summarise(
    OEC_mean = mean(oec, na.rm = TRUE),
    recall_imp_mean = mean(recall_imp, na.rm = TRUE),
    precision_imp_mean = mean(precision_imp, na.rm = TRUE),
    conflict_dec_mean = mean(conflict_dec, na.rm = TRUE),
    scle_log_confidence_imp_mean = mean(scle_log_confidence_imp, na.rm = TRUE)) %>%
  summarise(
    Diff_Treatment_Control = OEC_mean[Variant == "Treatment"] - OEC_mean[Variant == "Control"],
    Diff_recall_imp = recall_imp_mean[Variant == "Treatment"] - recall_imp_mean[Variant == "Control"],
    Diff_precision = precision_imp_mean[Variant == "Treatment"] - precision_imp_mean[Variant == "Control"],
    Diff_Conflict = conflict_dec_mean[Variant == "Treatment"] - conflict_dec_mean[Variant == "Control"],
    Diff_Confidence = scle_log_confidence_imp_mean[Variant == "Treatment"] - scle_log_confidence_imp_mean[Variant == "Control"],
    Perc_Treatment_Control = (Diff_Treatment_Control / abs(OEC_mean[Variant == "Control"])) * 100,
    Perc_recall = (Diff_recall_imp / abs(recall_imp_mean[Variant == "Control"])) * 100,
    Perc_precision = (Diff_precision / abs(precision_imp_mean[Variant == "Control"])) * 100,
    Perc_Conflict = (Diff_Conflict / abs(conflict_dec_mean[Variant == "Control"])) * 100,
    Perc_Confidence = (Diff_Confidence / abs(scle_log_confidence_imp_mean[Variant == "Control"])) * 100
  )

# View pairwise differences
print(pairwise_diff)
## # A tibble: 1 × 10
##   Diff_Treatment_Control Diff_recall_imp Diff_precision Diff_Conflict
##                    <dbl>           <dbl>          <dbl>         <dbl>
## 1                 0.0536          0.0543         0.0419        0.0765
## # ℹ 6 more variables: Diff_Confidence <dbl>, Perc_Treatment_Control <dbl>,
## #   Perc_recall <dbl>, Perc_precision <dbl>, Perc_Conflict <dbl>,
## #   Perc_Confidence <dbl>

According to these results, Treatment improved the OEC by 93.9% relative to Control (a statistically significant difference, p < 0.001). The reduced conflict rate and increased confidence indicate that the Treatment has improved the efficiency and stability of the approval process.

visualization for oec

# Show the absolute Treatment-minus-Control differences per metric.
# geom_col() is the documented idiomatic equivalent of
# geom_bar(stat = "identity") and produces the same bars.
ggplot(pairwise_diff) +
  geom_col(aes(x = "OEC", y = Diff_Treatment_Control), fill = "steelblue") +
  geom_col(aes(x = "Recall", y = Diff_recall_imp), fill = "steelblue") +
  geom_col(aes(x = "Precision", y = Diff_precision), fill = "steelblue") +
  geom_col(aes(x = "Conflict", y = Diff_Conflict), fill = "steelblue") +
  geom_col(aes(x = "Confidence", y = Diff_Confidence), fill = "steelblue") +
  labs(title = "Pairwise Differences (Absolute)", x = "Metric", y = "Difference") +
  theme_minimal()

# Show the percentage Treatment-minus-Control differences per metric.
# geom_col() is the documented idiomatic equivalent of
# geom_bar(stat = "identity") and produces the same bars.
ggplot(pairwise_diff) +
  geom_col(aes(x = "OEC", y = Perc_Treatment_Control), fill = "steelblue") +
  geom_col(aes(x = "Recall", y = Perc_recall), fill = "steelblue") +
  geom_col(aes(x = "Precision", y = Perc_precision), fill = "steelblue") +
  geom_col(aes(x = "Conflict", y = Perc_Conflict), fill = "steelblue") +
  geom_col(aes(x = "Confidence", y = Perc_Confidence), fill = "steelblue") +
  labs(title = "Pairwise Differences (Percentage)", x = "Metric", y = "Percentage Change (%)") +
  theme_minimal()

Compute the effect size

# Effect size of the Treatment-vs-Control difference in OEC.
# Fix: use <- for assignment (= at top level is discouraged R style) and
# use lowercase snake_case local names. The cohens_d() call is unchanged;
# it drops NAs itself (see the warning in the output).
control_oec <- df_2$oec[df_2$Variant == "Control"]
treatment_oec <- df_2$oec[df_2$Variant == "Treatment"]

cohens_d(treatment_oec, control_oec) # effect size of Treatment vs Control
## Warning: Missing values detected. NAs dropped.
## Cohen's d |       95% CI
## ------------------------
## 0.65      | [0.39, 0.90]
## 
## - Estimated using pooled SD.
effectsize::interpret_cohens_d(0.65)
## [1] "medium"
## (Rules: cohen1988)

Treatment significantly improved the OEC (p < 0.001, d = 0.65, a medium effect), a 93.9% relative gain over Control.
The positive impact of AI on recall and precision has both statistical and practical significance, considering the p-value, Cohen’s d measure, and percentage change.
Thus, the AI model should be implemented given the statistically significant and practically meaningful improvements in decision accuracy and alignment with AI recommendations.

Power Analysis

# Required sample size per group to detect a medium effect with 80% power
pwr.t.test(power = .8, # 80% power
           d = 0.5, # Cohen's d 
           sig.level = 0.05, # threshold for p-val
           type = "two.sample") 
## 
##      Two-sample t test power calculation 
## 
##               n = 63.76561
##               d = 0.5
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

Power analysis indicates that with a moderate effect size (Cohen’s d = 0.5), a sample size of approximately 64 per group is required to achieve 80% power at a 5% significance level. This means the study has an 80% chance of detecting a true effect if it exists, while keeping the risk of a Type I error at 5%. If the actual sample size is smaller, the study may lack sufficient power to detect moderate effects reliably.